******************************************************************************************************************************************* fragment
*************************************** SUMMARIZING CLAIMS DATA 
******************************************************************************************************************************************* fragment
clear all  
set more off
set matsize 11000 
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"

********** create temp files per each of the claims data 

forvalues j = 1/8 {

cd  "C:\PlanChoice\"  
use mc_2015_medclaims`j'_8.dta, clear  


*********** NEW VARIABLES FOR RANDOM FOREST ***********************************************

gen inpatient    = 1 if pl_of_srvc_sys_id==5 | pl_of_srvc_sys_id==19 | pl_of_srvc_sys_id==23 | pl_of_srvc_sys_id==32 | pl_of_srvc_sys_id==46 
replace inpatient=0 if inpatient==.
gen outpatient   = 1 if pl_of_srvc_sys_id==3 | pl_of_srvc_sys_id==7 | pl_of_srvc_sys_id==12 | pl_of_srvc_sys_id==15 | pl_of_srvc_sys_id==24 | pl_of_srvc_sys_id==40 | pl_of_srvc_sys_id==42 | pl_of_srvc_sys_id==43 | pl_of_srvc_sys_id==44 | pl_of_srvc_sys_id==45 | pl_of_srvc_sys_id==47 | pl_of_srvc_sys_id==84 | pl_of_srvc_sys_id==97 | pl_of_srvc_sys_id==808 | pl_of_srvc_sys_id==931
replace outpatient=0 if outpatient==.
gen specialty    = 1 if pl_of_srvc_sys_id==1 | pl_of_srvc_sys_id==13 | pl_of_srvc_sys_id==57 | pl_of_srvc_sys_id==58 | pl_of_srvc_sys_id==71 | pl_of_srvc_sys_id==213 | pl_of_srvc_sys_id==390 
replace specialty=0 if specialty==.
gen other_claims = 1 - inpatient - outpatient - specialty

gen dollar_claims_inpatient     = inpatient  * allw_amt
gen dollar_claims_outpatient    = outpatient * allw_amt
gen dollar_claims_specialty     = specialty  * allw_amt
gen dollar_claims_other_claims  = other_claims  * allw_amt

bys mbr_sys_id year cust_seg_nbr: egen number_inpatient     = sum(inpatient)
bys mbr_sys_id year cust_seg_nbr: egen number_outpatient    = sum(outpatient)
bys mbr_sys_id year cust_seg_nbr: egen number_specialty     = sum(specialty)
bys mbr_sys_id year cust_seg_nbr: egen number_other_claims  = sum(other_claims)

********************************************************************************************


gen oop_dircomputed = coins_amt + copay_amt + ded_amt
gen opp_indircomputed = allw_amt - net_pd_amt 

forvalues i = 1/5 {
  gen codeAMI`i' = substr(diag_`i'_cd,1,4)
  gen codeDIABETES`i' = substr(diag_`i'_cd,1,3)
  local icd "410 4101 4102 4103 4104 4105 4108 4109"
     foreach x of local icd {
     gen dummyAMI_`i'_`x' = (codeAMI`i' =="`x'")
     }
  gen dummyDIABETES_`i' = (codeDIABETES`i' =="250")
  egen dummyAMI_`i'      = rowtotal(dummyAMI_`i'_*)
}

gen code_ami = 1 if dummyAMI_1 ==1 | dummyAMI_2==1 | dummyAMI_3==1 |dummyAMI_4==1 | dummyAMI_5==1   
replace code_ami=0 if code_ami==.
gen code_diabetes = 1 if dummyDIABETES_1 ==1 | dummyDIABETES_2==1 | dummyDIABETES_3==1 | dummyDIABETES_4==1 | dummyDIABETES_5==1   
replace code_diabetes=0  if code_diabetes==.

gen mem_died = (dschrg_sts_cd=="20")

gen cd2_d1 = substr(diag_1_cd,1,2)
gen cd3_d1 = substr(diag_1_cd,1,3)
gen cd4_d1 = substr(diag_1_cd,1,4)
gen cd5_d1 = substr(diag_1_cd,1,5)
gen cd2_d2 = substr(diag_2_cd,1,2)
gen cd3_d2 = substr(diag_2_cd,1,3)
gen cd4_d2 = substr(diag_2_cd,1,4)
gen cd5_d2 = substr(diag_2_cd,1,5)

cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"

sort cd3_d1
merge cd3_d1 using Cancer3Digits.dta 
drop _merge
sort cd4_d1
merge cd4_d1 using Cancer4Digits.dta 
drop _merge
sort cd5_d1
merge cd5_d1 using Cancer5Digits.dta 
drop _merge

egen code_cancer2 = rowmax(cancer3digits cancer4digits cancer5digits)
replace code_cancer2=0 if code_cancer2!=1

gen code_leukemia = 1 if cd3_d1=="205" | cd3_d2=="205"
replace  code_leukemia=0 if  code_leukemia==.

gen code_septicemia = 1 if cd4_d1=="0389" | cd4_d2=="0389"
replace  code_septicemia=0 if  code_septicemia==.

gen code_coronaryatheros = 1 if cd3_d1=="414" | cd3_d2=="414"
replace  code_coronaryatheros=0 if  code_coronaryatheros==.

gen code_heartvalve = 1 if cd3_d1=="424" | cd3_d2=="424"
replace  code_heartvalve=0 if  code_heartvalve==.

gen code_brainhemorr = 1 if cd3_d1=="430" | cd3_d2=="430"
replace  code_brainhemorr=0 if  code_brainhemorr==.

gen code_respfailure = 1 if cd5_d1=="51881" | cd5_d2=="51881"
replace  code_respfailure=0 if  code_respfailure==.

gen code_sepshock = 1 if cd5_d1=="78552" | cd5_d2=="78552"
replace  code_sepshock=0 if  code_sepshock==.

gen code_respfainewborn = 1 if cd5_d1=="77084" | cd5_d2=="77084"
replace  code_respfainewborn=0 if  code_respfainewborn==.

gen code_respdistrnewborn = 1 if cd3_d1=="769" | cd3_d2=="769"
replace  code_respdistrnewborn=0 if  code_respdistrnewborn==.

gen code_kidneyfail = 1 if cd3_d1=="584" | cd3_d2=="584"
replace  code_kidneyfail=0 if  code_kidneyfail==.

gen code_respfaisurg = 1 if cd5_d1=="51851" | cd5_d2=="51851"
replace  code_respfaisurg=0 if  code_respfaisurg==.

gen code_neumonia = 1 if cd3_d1=="486" | cd3_d2=="486"
replace  code_neumonia=0 if  code_neumonia==.

gen code_brainedema = 1 if cd4_d1=="3485" | cd4_d2=="3485"
replace  code_brainedema=0 if  code_brainedema==.

gen code_pancytopenia = 1 if cd5_d1=="28411" | cd5_d2=="28411"
replace  code_pancytopenia=0 if  code_pancytopenia==.

gen code_posthemorranemia = 1 if cd4_d1=="2851" | cd4_d2=="2851"
replace  code_posthemorranemia=0 if  code_posthemorranemia==.

gen code_traubraininj = 1 if cd3_d1=="854" | cd3_d2=="854"
replace  code_traubraininj=0 if  code_traubraininj==.

gen code_transplant = 1 if cd3_d1=="V42" | cd3_d2=="V42"
replace  code_transplant=0 if  code_transplant==.


*new codes
gen code_asthma = 1 if cd3_d1=="493" | cd3_d2=="493"
replace  code_asthma=0 if  code_asthma==.

gen code_atrialfibrilization = 1 if cd3_d1=="427" | cd3_d2=="427"
replace  code_atrialfibrilization=0 if  code_atrialfibrilization==.

gen code_chronickidneydisease = 1 if cd3_d1=="585" | cd3_d2=="585"
replace  code_chronickidneydisease=0 if  code_chronickidneydisease==.

gen code_heartfailure = 1 if cd3_d1=="428" | cd3_d2=="428"
replace  code_heartfailure=0 if  code_heartfailure==.

gen code_hypertension = 1 if cd3_d1=="401" | cd3_d2=="401" |cd3_d1=="402" | cd3_d2=="402" | cd3_d1=="403" | cd3_d2=="403" | cd3_d1=="404" | cd3_d2=="404" | cd3_d1=="405" | cd3_d2=="405"
replace  code_hypertension=0 if  code_hypertension==.


*fixing cancer
gen code_cancer = 1 if cd2_d1=="14" | cd2_d1=="15" | cd2_d1=="16" | cd2_d1=="17" | cd2_d1=="18" | cd2_d1=="19" | cd2_d1=="20" 
*| cd2_d1=="21" | cd2_d1=="22" | cd2_d1=="23" 
replace  code_cancer=0 if  code_cancer==.

gen code_cancer_b = 1 if cd2_d1=="14" | cd2_d1=="15" | cd2_d1=="16" | cd2_d1=="17" | cd2_d1=="18" | cd2_d1=="19" | cd2_d1=="20" | cd2_d2=="14" | cd2_d2=="15" | cd2_d2=="16" | cd2_d2=="17" | cd2_d2=="18" | cd2_d2=="19" | cd2_d2=="20"
*| cd2_d1=="21" | cd2_d1=="22" | cd2_d1=="23" 
replace  code_cancer_b=0 if  code_cancer_b==.



collapse (max) number_inpatient number_outpatient number_specialty number_other_claims code_cancer_b code_hypertension code_heartfailure code_chronickidneydisease code_atrialfibrilization code_asthma code_cancer code_cancer2 code_traubraininj code_transplant code_leukemia code_septicemia code_coronaryatheros code_heartvalve code_brainhemorr code_respfailure code_sepshock code_respfainewborn code_respdistrnewborn code_kidneyfail code_respfaisurg code_neumonia code_brainedema code_pancytopenia code_posthemorranemia mem_died code_ami code_diabetes (sum) dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims oop_dircomputed net_pd_amt allw_amt (firstnm) rel_cd  , by(mbr_sys_id year cust_seg_nbr )
tempfile aux_claims`j'
save `aux_claims`j''
}

use `aux_claims1'

forvalues j = 2/8 {
append using `aux_claims`j''
}

collapse (max) number_inpatient number_outpatient number_specialty number_other_claims code_cancer_b code_hypertension code_heartfailure code_chronickidneydisease code_atrialfibrilization code_asthma code_cancer code_cancer2 code_traubraininj code_transplant code_leukemia code_septicemia code_coronaryatheros code_heartvalve code_brainhemorr code_respfailure code_sepshock code_respfainewborn code_respdistrnewborn code_kidneyfail code_respfaisurg code_neumonia code_brainedema code_pancytopenia code_posthemorranemia mem_died code_ami code_diabetes (sum) dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims oop_dircomputed net_pd_amt allw_amt (firstnm) rel_cd  , by(mbr_sys_id year cust_seg_nbr )
gen customer_number = substr( cust_seg_nbr ,3,7)
sort year mbr_sys_id customer_number
rename customer_number polnbr
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
save total_claims_20122013_RandomForest.dta, replace 
************************************************************************************************* 
************************************************************************************************* 





************************************************************************************************* 
************************************************************************************************* 
**** PHARMACY CLAIMS DATABASE  
************************************************************************************************* 
clear all
set more off
set matsize 11000
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
********** create temp files per each of the claims data 
cd "C:\PlanChoice\"
use mc_2015_rxclaims.dta 


sort ndc 
merge ndc using "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\NDCtoCLASSES.dta"
drop if _merge==2
egen drugs_classes_drugs = group(drug_class)

forvalues i =1/133{
gen aux1 = 1 if drugs_classes_drugs==`i'
replace aux1=0 if aux==.
gen aux2 = pd_amt if drugs_classes_drugs==`i'
replace aux2=0 if aux2==.
bys mbr_sys_id year cust_seg_nbr: egen drugclass_number_`i' = sum(aux1) 
bys mbr_sys_id year cust_seg_nbr: egen drugclass_dollar_`i' = sum(aux2) 
drop aux1 aux2
}


********************************
egen code_drugs = group(ndc)
bys mbr_sys_id year cust_seg_nbr: egen num_drugs = count(code_drugs)
********************************

collapse (max) drugclass_dollar_* drugclass_number_* num_drugs (sum) copay_amt ded_amt pd_amt sbmt_chrg_amt , by(mbr_sys_id year cust_seg_nbr )
gen customer_number = substr( cust_seg_nbr ,3,7)
drop cust_seg_nbr
rename customer_number polnbr
sort year mbr_sys_id polnbr
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
order year mbr_sys_id polnbr sbmt_chrg_amt copay_amt ded_amt pd_amt drugclass_dollar_* drugclass_number_*
save total_pharmaclaims_20122013_RandomForest.dta, replace 
*************************************************************************************************


************************************************************************************************* 
************************************************************************************************* 
*SOLUTIONS CLAIMS DATABASE  
************************************************************************************************* 
clear all
set more off
set matsize 11000
********** create temp files per each of the claims data 
cd "C:\PlanChoice\"
use mc_2015_rxclaims_sol.dta

sort ndc 
merge ndc using "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\NDCtoCLASSES.dta"
drop if _merge==2
egen drugs_classes_drugs = group(drug_class)

forvalues i =1/133{
gen aux1 = 1 if drugs_classes_drugs==`i'
replace aux1=0 if aux==.
gen aux2 = pd_amt if drugs_classes_drugs==`i'
replace aux2=0 if aux2==.
bys mbr_sys_id year cust_seg_nbr: egen drugclass_number_`i' = sum(aux1) 
bys mbr_sys_id year cust_seg_nbr: egen drugclass_dollar_`i' = sum(aux2) 
drop aux1 aux2
}


********************************
egen code_drugs = group(ndc)
bys mbr_sys_id year cust_seg_nbr: egen num_drugs = count(code_drugs)
********************************

collapse (max) drugclass_dollar_* drugclass_number_*  num_drugs (sum) sbmt_chrg_amt copay_amt ded_amt pd_amt , by(mbr_sys_id year cust_seg_nbr )
gen customer_number = substr( cust_seg_nbr ,3,7)
drop cust_seg_nbr
rename customer_number polnbr
sort year mbr_sys_id polnbr
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
order year mbr_sys_id polnbr sbmt_chrg_amt copay_amt ded_amt pd_amt drugclass_dollar_* drugclass_number_*
save total_solutionsclaims_20122013_RandomForest.dta, replace 
*************************************************************************************************

*************************************************************************************************
** APPEND PHARMA CLAIMS
*************************************************************************************************
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
use total_pharmaclaims_20122013_RandomForest.dta, clear
append using total_solutionsclaims_20122013_RandomForest.dta
collapse (max) drugclass_dollar_* drugclass_number_* num_drugs (sum) sbmt_chrg_amt copay_amt ded_amt pd_amt , by(mbr_sys_id year polnbr )
sort year mbr_sys_id polnbr
save total_drugclaims_20122013_RandomForest.dta, replace 
*************************************************************************************************

exit

*************************************************************************************************
*************************************************************************************************
*************************************************************************************************
****************** HERE UNITING THE CLAIMS FROM 2014 AND PREVIOUS YEARS 
*************************************************************************************************
*************************************************************************************************
*************************************************************************************************
clear all 
use total_claims_20122013_RandomForest.dta, clear
drop if year==2014
append using total_claims_2014_RandomForest.dta
append using total_claims_2015_RandomForest.dta
sort year mbr_sys_id polnbr
save total_claims_RandomForest.dta, replace
*************************************************************************************************
clear all 
use total_drugclaims_20122013_RandomForest.dta, clear
drop if year==2014
append using total_pharmaclaims_2014_RandomForest.dta
append using total_pharmaclaims_2015_RandomForest.dta
sort year mbr_sys_id polnbr 
save total_pharmaclaims_RandomForest.dta, replace
*************************************************************************************************

exit


*************************************************************************************************
*************************************************************************************************
*************************************************************************************************
****************** PATIENTS DATABASE
*************************************************************************************************
*************************************************************************************************
*************************************************************************************************
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
forvalues i=2/5 {
clear  
insheet using patient_year201`i'_forRandomForest.csv, comma
rename patient_id mbr_sys_id
gen year=201`i' 
save aux_year201`i', replace
}
use aux_year2012, clear
append using aux_year2013
append using aux_year2014
append using aux_year2015
sort mbr_sys_id year
save aux_patients_sexgender.dta, replace


exit



*************************************************************************************************
*************************************************************************************************
****************** MERGE EVERYTHING AND CREATE FILE FOR R *************************************************************************************************
clear  
cd "C:\Users\sfleit2\Documents\ReclassificationRisk_DoFiles\"
************
set more off
use total_pharmaclaims_RandomForest.dta
merge year mbr_sys_id polnbr using total_claims_RandomForest.dta
sort  polnbr year
drop if year!=2012 &  year!=2013 &  year!=2014 &  year!=2015
drop _merge
*merge polnbr using firmsandmarkets.dta
*drop if _merge!=3
*drop if pd_amt==.
*drop if allw_amt==.
*drop _merge
sort  mbr_sys_id year
merge mbr_sys_id year using aux_patients_sexgender.dta
drop if _merge==1
drop _merge
order year mbr_sys_id polnbr cust_seg_nbr rel_cd age sex customer_number minor_market
foreach var of varlist drugclass_dollar_1-allw_amt {
replace `var'=0 if `var'==.
}
gen dollar_drugs = copay_amt + pd_amt
gen total_expenditure = copay_amt + pd_amt + allw_amt
xtset mbr_sys_id year
gen total_future_expenditure = F.total_expenditure
*sort customer_number
*save aux_to_drop_now.dta
*use firmsandmarkets.dta, clear 
*sort customer_number
*merge customer_number using aux_to_drop_now.dta
*exit
*drop if _merge!=3
*drop _merge
gen id=_n
tab minor_market, gen(dummarket_)
save data_filetoR_aux.dta, replace 
keep id year mbr_sys_id total_future_expenditure dummarket_* drugclass_dollar_* drugclass_number_* age sex dollar_drugs dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims num_drugs number_inpatient number_outpatient number_specialty number_other_claims code_* 
outsheet using filetoR_randomforest_predict_vfixed.csv, comma replace
drop if year!=2012
drop if total_future_expenditure==.
keep total_future_expenditure dummarket_* drugclass_dollar_* drugclass_number_* age sex dollar_drugs dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims num_drugs number_inpatient number_outpatient number_specialty number_other_claims code_* 
outsheet using filetoR_randomforest_2012_vfixed.csv, comma replace
use data_filetoR_aux.dta, clear
drop if year!=2013
drop if total_future_expenditure==.
keep total_future_expenditure dummarket_* drugclass_dollar_* drugclass_number_* age sex dollar_drugs dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims num_drugs number_inpatient number_outpatient number_specialty number_other_claims code_* 
outsheet using filetoR_randomforest_2013_vfixed.csv, comma replace
use data_filetoR_aux.dta, clear
drop if year!=2014
drop if total_future_expenditure==.
keep total_future_expenditure dummarket_* drugclass_dollar_* drugclass_number_* age sex dollar_drugs dollar_claims_inpatient dollar_claims_outpatient dollar_claims_specialty dollar_claims_other_claims num_drugs number_inpatient number_outpatient number_specialty number_other_claims code_* 
outsheet using filetoR_randomforest_2014_vfixed.csv, comma replace
*************************************************************************************************


